Concordance (Word Count)
Find the ten most commonly used words in a text file.
# Load the entire book into a single string; the file is closed automatically
# when the with-block ends.
with open('paradise-lost.txt', encoding='utf-8-sig') as book:
    text = book.read()
# print(text[:50]) # 'The Project Gutenberg EBook of Paradise Lost, by J'
Method #1
def concordance(text):
    """Count how often each whitespace-separated token occurs in *text*.

    Tokens are taken exactly as ``str.split()`` produces them: letter case
    and any attached punctuation are kept, so 'The' and 'the,' are
    different keys.

    Returns a plain dict mapping token -> number of occurrences.
    """
    counts = {}
    for token in text.split():
        if token in counts:
            counts[token] += 1
        else:
            # First time this token is seen.
            counts[token] = 1
    return counts
# Show the raw token -> count mapping for the whole text.
raw_counts = concordance(text)
print(raw_counts)
Output:
{'The': 475,
'Project': 84,
'Gutenberg': 25,
. . .
'2007]': 1,
'Language:': 1,
'English': 1,
'Character': 1,
'set': 36,
'encoding:': 1,
'ASCII': 1,
'***': 6,
. . .
Modifications: keep only word characters and convert everything to lowercase
from re import sub
def concordance(text):
    """Count case-insensitive word frequencies in *text*.

    Each whitespace-separated token is lowercased and stripped of every
    non-word character (anything ``\\w`` does not match). Tokens that
    consist only of such characters, e.g. '***', are skipped instead of
    being counted under the empty string.

    Returns a plain dict mapping word -> number of occurrences.
    """
    freq = {}
    for token in text.split():
        # Raw string: '\w' in a normal string literal is an invalid escape.
        word = sub(r'[^\w]', '', token.lower())  # not words -> empty string
        if not word:
            # Nothing left after cleaning, so this token was not a word.
            continue
        if word not in freq:
            freq[word] = 0
        freq[word] += 1
    return freq
freq = concordance(text)
# Rank the (word, count) pairs by count, highest first, and keep the top ten.
by_count = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
print(by_count[:10])
Output:
[('and', 3483),
('the', 3162),
('to', 2326),
('of', 2186),
('in', 1430),
('with', 1208),
('his', 1181),
('or', 795),
('that', 720),
('all', 712)]
Method #2 (initialize and assign in one step)
from re import sub
def concordance(text):
    """Count case-insensitive word frequencies in *text* using ``dict.get``.

    Each whitespace-separated token is lowercased and stripped of every
    non-word character (anything ``\\w`` does not match). Tokens that
    consist only of such characters, e.g. '***', are skipped instead of
    being counted under the empty string.

    Returns a plain dict mapping word -> number of occurrences.
    """
    freq = {}
    for token in text.split():
        # Raw string: '\w' in a normal string literal is an invalid escape.
        word = sub(r'[^\w]', '', token.lower())
        if not word:
            # Nothing left after cleaning, so this token was not a word.
            continue
        # get() supplies 0 for an unseen word, so initialising and
        # incrementing happen in one assignment.
        # Equivalent: freq[word] = freq.setdefault(word, 0) + 1
        freq[word] = freq.get(word, 0) + 1
        # `freq.setdefault(word, 0) += 1` does not work: an augmented
        # assignment cannot target a function call.
    return freq
freq = concordance(text)
# Ten most frequent words: sort the (word, count) pairs on the count, descending.
top_ten = sorted(freq.items(), key=lambda item: item[1], reverse=True)[:10]
print(top_ten)
Method #3 (defaultdict)
Template:
from collections import defaultdict

# defaultdict(int) creates a missing key on first access, with value int() == 0.
x = defaultdict(int)
x['asdfasdf']  # merely reading the key inserts it
x[1]
print(x)  # defaultdict(<class 'int'>, {'asdfasdf': 0, 1: 0})
Code:
from re import sub
from collections import defaultdict
def concordance(text):
    """Count case-insensitive word frequencies in *text* with a defaultdict.

    Each whitespace-separated token is lowercased and stripped of every
    non-word character (anything ``\\w`` does not match). Tokens that
    consist only of such characters, e.g. '***', are skipped instead of
    being counted under the empty string.

    Returns a ``defaultdict(int)`` mapping word -> number of occurrences.
    """
    freq = defaultdict(int)  # <-- unseen words start at int() == 0
    for token in text.split():
        # Raw string: '\w' in a normal string literal is an invalid escape.
        word = sub(r'[^\w]', '', token.lower())
        if not word:
            # Nothing left after cleaning, so this token was not a word.
            continue
        freq[word] += 1  # <-- no explicit initialisation needed
    return freq
freq = concordance(text)
# Order the (word, count) pairs from most to least frequent; show the first ten.
ranked = sorted(freq.items(), key=lambda entry: entry[1], reverse=True)
print(ranked[:10])
Method #4 (Counter)
Template:
from collections import Counter

# Counter tallies the items of any iterable; its repr lists the entries from
# the most common to the least common.
print(Counter('aabbbc'))  # Counter({'b': 3, 'a': 2, 'c': 1})
Code:
from re import sub
from collections import Counter
def concordance(text):
    """Count case-insensitive word frequencies in *text* with a Counter.

    Each whitespace-separated token is lowercased and stripped of every
    non-word character (anything ``\\w`` does not match). Tokens that
    consist only of such characters, e.g. '***', are skipped instead of
    being counted under the empty string.

    Returns a ``Counter`` mapping word -> number of occurrences.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape.
    cleaned = (sub(r'[^\w]', '', token.lower()) for token in text.split())
    # Drop tokens that were pure punctuation and are now empty.
    return Counter(word for word in cleaned if word)
freq = concordance(text)
# concordance() returns a Counter here, which can rank its own entries:
# most_common(10) gives the ten (word, count) pairs with the highest counts,
# ordered from most to least frequent.
print(freq.most_common(10))
Output:
[('and', 3483),
('the', 3162),
('to', 2326),
('of', 2186),
('in', 1430),
('with', 1208),
('his', 1181),
('or', 795),
('that', 720),
('all', 712)]